library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
# Read the bin table and tidy it into one row per bin with:
#   * short quality column names (Completeness, Contamination),
#   * one column per taxonomic rank (Domain .. Genus),
#   * the genome name broken into Site, Sample Name, Site ID, Subplot,
#     Layer and Date.
neon_jgi_pilot_ind_bins <- read_csv("neon-jgi_pilot_ind_assembly_bins.csv") %>%
  rename(
    Completeness = `Bin Completeness`,
    Contamination = `Bin Contamination`,
    Site = `Genome Name`
  ) %>%
  # Put taxa categories in separate columns.
  # Strip the rank prefixes (d__, p__, c__, o__, f__, g__) in one pass; the
  # species prefix (s__) is intentionally left alone.
  mutate(
    `GTDB-Tk Taxonomy Lineage` =
      str_remove_all(`GTDB-Tk Taxonomy Lineage`, "[dpcofg]__")
  ) %>%
  # Only Domain..Genus are kept: a trailing species-level piece is dropped
  # (extra = "drop") and lineages that stop above genus are padded with NA
  # (fill = "right"). Stating both explicitly avoids the per-row warnings
  # that the defaults emit for this data.
  separate(
    `GTDB-Tk Taxonomy Lineage`,
    into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus"),
    sep = "; ",
    extra = "drop",
    fill = "right"
  ) %>%
  # Simplify Site name
  mutate(Site = str_remove(Site, "Soil microbial communities from ")) %>%
  separate(Site, into = c("Site", "Sample Name"), sep = " - ") %>%
  mutate(`Sample Name` = str_remove(`Sample Name`, "-comp-1")) %>%
  # NOTE(review): assumes Sample Name looks like
  # <site id>_<subplot>-<layer>-<date> -- confirm against the CSV.
  # Sample Name itself is kept (remove = FALSE).
  separate(
    `Sample Name`,
    into = c("Site ID", "subplot.layer.date"),
    sep = "_",
    remove = FALSE
  ) %>%
  separate(
    subplot.layer.date,
    into = c("Subplot", "Layer", "Date"),
    sep = "-"
  )
## Rows: 1130 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Persist the cleaned bin table as a tab-separated file.
write_tsv(
  x = neon_jgi_pilot_ind_bins,
  file = "neon_jgi_pilot_ind_bins.tsv"
)
# Interactive table: number of bins per phylum (n) and each phylum's share of
# all bins in percent (freq, rounded to one decimal). Bins without a phylum
# are counted in their own NA row.
neon_jgi_pilot_ind_bins %>%
  count(Phylum, name = "n") %>%
  mutate(freq = round(100 * n / sum(n), 1)) %>%
  datatable()
# Interactive table: number of bins (n) per IMG genome / site combination.
# count() returns an ungrouped result, so summarise()'s "grouped output"
# message is not triggered.
neon_jgi_pilot_ind_bins %>%
  count(`IMG Genome ID`, Site, name = "n") %>%
  datatable()
# Interactive table: number of bins (n) recovered per site.
neon_jgi_pilot_ind_bins %>%
  count(Site, name = "n") %>%
  datatable()
#### Genome size histogram
# Histogram of bin sizes: `Total Number of Bases` on the x axis, grouped into
# 500,000-base-wide bars; the y axis is the number of bins in each bar.
neon_jgi_pilot_ind_bins %>%
  ggplot(aes(x = `Total Number of Bases`)) +
  geom_histogram(colour = "black", fill = "maroon", binwidth = 500000) +
  labs(
    title = "Genome size of MAGs",
    x = "Genome size (total bases)",
    y = "Number of MAGs"
  ) +
  theme(text = element_text(size = 20, color = "black"))
# theme(axis.text.x = element_text(angle = 90))
#### Class Count Bar chart by site
#### Site Count Bar chart by phylum
#### Site Count Bar chart by Class